
**********
* Readme *
**********

* This script
* [1] pre-processes the relevant strata information for POF
* [2] pre-processes the common stratification info (SIPD)
* [3] pre-processes the map of income/expenses categories for POF
* [4] pre-processes the relevant deflator files for pnad
* [5] pre-processes interest rate data 
* [6] processes ILO data (to estimate the aggregate oaw share in the developing world)


* Root folder (PATH TO BE DEFINED BY THE USER)
**********************************************
clear all
global analysis "C:/***/replication_package"


* Timestamped log
*****************
global today = strofreal(date(c(current_date), "DMY"), "%tdYYNNDD")
log using "${analysis}/code/logs/1_3_read_aux_files_${today}.smcl", replace


************************************
* [1] Estimation domain info (POF) *
************************************

* Set working directory 
cd "${analysis}/data/"

* This file is part of the documentation bundle for POF available at:
* https://ftp.ibge.gov.br/Orcamentos_Familiares/Pesquisa_de_Orcamentos_Familiares_2017_2018/Microdados/

import excel "${analysis}/data/source_files/pof_post_strata.xlsx", sheet("Planilha1") cellrange(A6:E5507) firstrow clear

rename uf state
tostring ESTRATO_POF, generate(pof_strata)
rename pos_estrato region
rename TOTAL_PESSOAS_REFERENCIA pop_region
label var pop_region "Estimated population for geographical estimation domain"

drop COD_UPAUFSEQDV
duplicates drop state region pof_strata pop_region, force

order state region pof_strata pop_region
sort pof_strata

gen COD_UF = state

* state (categorical)
label define state 11 "RO"
label define state 12 "AC", add
label define state 13 "AM", add
label define state 14 "RR", add
label define state 15 "PA", add
label define state 16 "AP", add
label define state 17 "TO", add

label define state 21 "MA", add
label define state 22 "PI", add
label define state 23 "CE", add
label define state 24 "RN", add
label define state 25 "PB", add
label define state 26 "PE", add
label define state 27 "AL", add
label define state 28 "SE", add
label define state 29 "BA", add

label define state 31 "MG", add
label define state 32 "ES", add
label define state 33 "RJ", add
label define state 35 "SP", add

label define state 41 "PR", add
label define state 42 "SC", add
label define state 43 "RS", add

label define state 50 "MS", add
label define state 51 "MT", add
label define state 52 "GO", add
label define state 53 "DF", add

label values state state
label var state "Federative States"

* region (categorical)
label define region 111 "RO, capital"
label define region 113 "RO, remaining areas", add

label define region 121 "AC, capital", add
label define region 123 "AC, remaining areas", add

label define region 131 "AM, capital", add
label define region 132 "AM, metropolitan area", add
label define region 133 "AM, remaining areas", add

label define region 141 "RR, capital", add
label define region 143 "RR, remaining areas", add

label define region 151 "PA, capital", add
label define region 152 "PA, metropolitan area", add
label define region 153 "PA, remaining areas", add

label define region 161 "AP, capital", add 
label define region 162 "AP, metropolitan area", add
label define region 163 "AP, remaining areas", add

label define region 171 "TO, capital", add
label define region 173 "TO, remaining areas", add

label define region 211 "MA, capital", add
label define region 212 "MA, metropolitan area", add
label define region 213 "MA, remaining areas", add

label define region 221 "PI, capital", add
label define region 223 "PI, remaining areas", add

label define region 231 "CE, capital", add
label define region 232 "CE, metropolitan area", add
label define region 233 "CE, remaining areas", add

label define region 241 "RN, capital", add
label define region 242 "RN, metropolitan area", add
label define region 243 "RN, remaining areas", add

label define region 251 "PB, capital", add
label define region 252 "PB, metropolitan area", add
label define region 253 "PB, remaining areas", add

label define region 261 "PE, capital", add
label define region 262 "PE, metropolitan area", add
label define region 263 "PE, remaining areas", add

label define region 271 "AL, capital", add
label define region 272 "AL, metropolitan area", add
label define region 273 "AL, remaining areas", add

label define region 281 "SE, capital", add
label define region 282 "SE, metropolitan area", add
label define region 283 "SE, remaining areas", add

label define region 291 "BA, capital", add
label define region 292 "BA, metropolitan area", add
label define region 293 "BA, remaining areas", add

label define region 311 "MG, capital", add
label define region 312 "MG, metropolitan area", add
label define region 313 "MG, remaining areas", add

label define region 321 "ES, capital", add
label define region 322 "ES, metropolitan area", add
label define region 323 "ES, remaining areas", add

label define region 331 "RJ, capital", add
label define region 332 "RJ, metropolitan area", add
label define region 333 "RJ, remaining areas", add

label define region 351 "SP, capital", add
label define region 352 "SP, metropolitan area", add
label define region 353 "SP, remaining areas", add

label define region 411 "PR, capital", add
label define region 412 "PR, metropolitan area", add
label define region 413 "PR, remaining areas", add

label define region 421 "SC, capital", add
label define region 422 "SC, metropolitan area", add
label define region 423 "SC, remaining areas", add

label define region 431 "RS, capital", add
label define region 432 "RS, metropolitan area", add
label define region 433 "RS, remaining areas", add

label define region 501 "MS, capital", add
label define region 503 "MS, remaining areas", add

label define region 511 "MT, capital", add
label define region 512 "MT, metropolitan area", add
label define region 513 "MT, remaining areas", add

label define region 521 "GO, capital", add
label define region 522 "GO, metropolitan area", add
label define region 523 "GO, remaining areas", add

label define region 531 "DF, capital", add

label values region region
label var region "Geographical estimation domain"

describe
save "${analysis}/data/1_3_aux_strata_info.dta", replace

* Region population estimates data only
keep region pop_region
duplicates drop
rename pop_region pop_region_pof
save "${analysis}/data/1_3_aux_strata_pop.dta", replace

clear


*******************
* [2] SIPD strata *
*******************

* Define common strata identification to both surveys (as per SIPD)
* Key rule: pof strata is equivalent to pnad strata ordered by state - urban - admin - geo group - stat group


* POF survey
************
use "${analysis}/data/source_files/pof/morador.dta", clear
keep ESTRATO_POF TIPO_SITUACAO_REG V0403 V0405 ANOS_ESTUDO PESO_FINAL
tostring ESTRATO_POF, generate(pof_strata)
gen pw = PESO_FINAL / 1000
tostring TIPO_SITUACAO_REG, generate(strata_6) 
gen age = V0403
gen white = inlist(V0405, 1)
gen educ = ANOS_ESTUDO
order pof_strata strata_6 pw age white educ 

merge m:1 pof_strata using "${analysis}/data/1_3_aux_strata_info.dta", keepusing(region) nogenerate
tostring region, generate(strata_123)
gen strata_12 = substr(strata_123, 1, 2)
gen strata_3 = substr(strata_123, 3, 1)

gen survey = "pof"
keep survey pof_strata strata_12 strata_3 strata_6 pw age white educ
order survey pof_strata strata_12 strata_3 strata_6 pw age white educ
save "${analysis}/data/pof_temp.dta", replace


* PNAD survey
*************
use "${analysis}/data/1_2_pnad.dta", clear
keep if (Ano == 2017 & Trimestre == 1)
keep Estrato V2009 V2010 V1028 VD3005
tostring Estrato, generate(pnad_strata)
gen pw = V1028 / 1000
gen age = V2009
gen white = inlist(V2010, 1)
gen educ = VD3005 	
drop Estrato V2010

gen strata_12 = substr(pnad_strata, 1, 2)
label var strata_12 "state"

gen strata_3 = substr(pnad_strata, 3, 1)
label var strata_3 "capital = 1, met = 2, ride = 3 or 4, other cities = 5"


***********************************************************************************
* Due to lack of RIDE classif in POF, 2, 3 and 4 classif are adjusted

* MA: 3, 4 and 5 becomes 3 (others)
replace strata_3 = "3" if ((strata_3 == "4" | strata_3 == "5") & strata_12 == "21")

* PI: all 2 become 3 (others) 
replace strata_3 = "3" if (strata_3 == "2" & strata_12 == "22")

* MG: no need to adjust

* GO: 3, 4 and 5 becomes 3 (others)
replace strata_3 = "3" if ((strata_3 == "4" | strata_3 == "5") & strata_12 == "52")
***********************************************************************************

gen strata_45 = substr(pnad_strata, 4, 2)
label var strata_45 "groups 00 01 02 03 (if capital), 00 10 ... 60 (if not capital)"

gen strata_6 = substr(pnad_strata, 6, 1)
label var strata_6 "urban = 1, rural = 2"

gen strata_7 = substr(pnad_strata, 7, 1)
label var strata_7 "stat stratif groups 1 to 5"

keep pnad_strata strata_12 strata_3 strata_45 strata_6 strata_7 pw age white educ
order pnad_strata strata_12 strata_3 strata_45 strata_6 strata_7 pw age white educ

* Unified strata definition
gen strata_id = strata_12 + "-" + strata_3 + "-" + strata_45 + "-" + strata_6 + "-" + strata_7
label var strata_id "Common strata id for SIPD"

append using "${analysis}/data/pof_temp.dta"
replace survey = "pnad" if survey == ""

gen meta_region = strata_12 + strata_6 + strata_3
gen urban = inlist(strata_6, "1")
label define urban 0 "Rural" 1 "Urban"
label values urban urban
label var urban "Urban area"
order survey strata_id pnad_strata pof_strata meta_region strata_12 strata_3 strata_45 strata_6 strata_7 pw age white educ urban


* Matching PNAD and POF (with reality check)
********************************************
sort pof_strata pnad_strata

bysort pof_strata: egen tot_pw_pof  = sum(pw) if survey == "pof"
bysort pof_strata: egen m_age_pof   = sum(age*pw/tot_pw_pof) if survey == "pof"
bysort pof_strata: egen m_white_pof = sum(white*pw/tot_pw_pof) if survey == "pof"
bysort pof_strata: egen m_educ_pof  = sum(educ*pw/tot_pw_pof) if survey == "pof"

bysort pnad_strata: egen tot_pw_pnad  = sum(pw) if survey == "pnad"
bysort pnad_strata: egen m_age_pnad   = sum(age*pw/tot_pw_pnad) if survey == "pnad"
bysort pnad_strata: egen m_white_pnad = sum(white*pw/tot_pw_pnad) if survey == "pnad"
bysort pnad_strata: egen m_educ_pnad  = sum(educ*pw/tot_pw_pnad) if survey == "pnad"

drop pw white age educ
duplicates drop

sort meta_region pnad_strata pof_strata 

snapshot erase _all
snapshot save

keep if survey == "pof"
keep pof_strata meta_region tot_pw_pof m_age_pof m_white_pof m_educ_pof
save "${analysis}/data/pof_temp.dta", replace

snapshot restore 1

keep if survey == "pnad"
drop survey pof_strata meta_region tot_pw_pof m_age_pof m_white_pof m_educ_pof

merge 1:1 _n using "${analysis}/data/pof_temp.dta", nogenerate sorted


* Reality check
***************
order meta_region strata_id pnad_strata pof_strata strata_12 strata_3 strata_45 strata_6 strata_7 tot_pw_pnad tot_pw_pof m_age_pnad m_age_pof m_white_pnad m_white_pof m_educ_pnad m_educ_pof


* Save and clean temp files
***************************
keep strata_id pnad_strata pof_strata urban
merge m:1 pof_strata using "${analysis}/data/1_3_aux_strata_info.dta", nogenerate
sort strata_id
save "${analysis}/data/1_3_aux_strata_info.dta", replace
erase "${analysis}/data/pof_temp.dta"


*********************************************
* [3] Map of POF income/expenses categories *
*********************************************

* This file combines information from the "translation" files that are part of the documentation bundle for POF, available at:
* https://ftp.ibge.gov.br/Orcamentos_Familiares/Pesquisa_de_Orcamentos_Familiares_2017_2018/Microdados/

import delimited "${analysis}/data/source_files/pof_inc_exp_categories.csv", case(preserve) clear 
keep code V9001 exp_cat inc_cat
save "${analysis}/data/1_3_aux_pof_map.dta", replace


************************
* [4] Deflators (PNAD) *
************************

* This file is part of the documentation bundle for pnad, available at:
* https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/Documentacao/Deflatores.zip

import excel using "${analysis}/data/source_files/pnad_deflator_quarterly.xls", firstrow clear

* We want constant values at prices of January 2018
gen temp = Habitual if Ano == "2017" & trim == "12-01-02" 
bysort UF: egen hab_jan_18 = sum(temp)
replace Habitual = Habitual / hab_jan_18
drop temp hab_jan_18

gen temp = Efetivo if Ano == "2017" & trim == "12-01-02" 
bysort UF: egen efe_jan_18 = sum(temp)
replace Efetivo = Efetivo / efe_jan_18
drop temp efe_jan_18

* Set quarterly deflator
keep if trim == "01-02-03" | trim == "04-05-06" | trim == "07-08-09" | trim == "10-11-12"
gen month = substr(trim, 7, 8)
gen date = Ano + "-" + month

* Set state_date keys
gen state_date = UF + "-" + date

* Keep only the key and the relevant deflators
rename Habitual deflator_habitual
rename Efetivo deflator_efetivo
keep state_date deflator_habitual deflator_efetivo
order state_date deflator_habitual deflator_efetivo

* Export
save "${analysis}/data/1_3_aux_pnad_deflator_quarter.dta", replace


**********************
* [5] Interest rates *
**********************

* The time series for the relevant interest rate can be dynamically retrieved from Brazilian Central Bank's (BCB) api as follows:

clear

* Create the variables
gen str10 data = ""
gen str10 valor = ""

* Import BCB series "Month average interest rate of nonearmarked new credit operations - Households - Total"
insheetjson data valor using "https://api.bcb.gov.br/dados/serie/bcdata.sgs.25462/dados?formato=json", columns("data" "valor")

* Fix dates
gen int date = date(data, "DMY")
replace date = mofd(date)
format %tm date
label var date "Interview date"
drop data

* Fix values
gen market_rates = real(valor) / 100
label var market_rates "Monthly interest rate for personal credit"
drop valor

* Export
save "${analysis}/data/1_3_aux_market_rates.dta", replace


****************
* [6] ILO data *
****************

* ILO data from https://ilostat.ilo.org/data/
* Source id: EMP_2EMP_SEX_GEO_STE_NB_A
* Employment by sex, rural / urban areas and status in employment -- ILO modelled estimates

* Import
use "${analysis}/data/source_files/ilostat-2023-02-02.dta", clear
drop source source_label indicator indicator_label obs_status obs_status_label

* Sex = total
keep if sex == "SEX_T"
drop sex sex_label

* Focus on 2018
keep if time == "2018"
drop time

* Focus on oaw share
keep if inlist(classif2_label, "Status in employment (ICSE-93): Total", "Status in employment (ICSE-93): 3. Own-account workers", "Status in employment (ICSE-93): 1. Employees")

* Wide form
rename obs_value n_

replace classif1 = "nat" if classif1 == "GEO_COV_NAT"
replace classif1 = "urb" if classif1 == "GEO_COV_URB"
replace classif1 = "rur" if classif1 == "GEO_COV_RUR"
 
replace classif2 = "tot" if classif2_label == "Status in employment (ICSE-93): Total"
replace classif2 = "oaw" if classif2_label == "Status in employment (ICSE-93): 3. Own-account workers"
replace classif2 = "ee" if classif2_label == "Status in employment (ICSE-93): 1. Employees"

gen group = classif1 + "_" + classif2
drop classif1 classif2 classif1_label classif2_label

reshape wide n_, i(ref_area) j(group) string
order ref_area ref_area_label

* Rename big groups
replace ref_area = "WLD" if ref_area_label == "World"
replace ref_area = "LIC" if ref_area_label == "World: Low income"
replace ref_area = "LMC" if ref_area_label == "World: Lower-middle income"
replace ref_area = "UMC" if ref_area_label == "World: Upper-middle income"
replace ref_area = "HIC" if ref_area_label == "World: High income"

* Drop other data
keep if inlist(ref_area, "WLD", "LIC", "LMC", "LMC", "UMC", "HIC")

* Aggregate "World: Not High income"
set obs 6
replace ref_area = "NHIC" in 6
replace ref_area_label = "World: Not High income" in 6

* Sum all
foreach v in n_nat_ee n_nat_oaw n_nat_tot n_rur_ee n_rur_oaw n_rur_tot n_urb_ee n_urb_oaw n_urb_tot { 
    replace `v' = `v'[2] + `v'[3] + `v'[4] in 6 
}

* Key shares
gen oaw_share_nat = n_nat_oaw / n_nat_tot
gen oaw_share_urb = n_urb_oaw / n_urb_tot
gen oaw_share_rur = n_rur_oaw / n_rur_tot

gen ee_share_nat = n_nat_ee / n_nat_tot
gen ee_share_urb = n_urb_ee / n_urb_tot
gen ee_share_rur = n_rur_ee / n_rur_tot

format oaw_share_nat oaw_share_urb oaw_share_rur ee_share_nat ee_share_urb ee_share_rur %4.3f


* What is the share of own-account workers in the world (as of 2018)?
*********************************************************************

list ref_area_label oaw_share_nat, ab(16) separator(0) 


* What is the ratio of employeed to own-account workers in urban areas of rich countries (as of 2018)?
******************************************************************************************************

di n_urb_ee[5] / n_urb_oaw[5]
    
    
* End of script
***************
cap log close